library(Seurat)
library(SnapATAC)
library(tidyverse)
library(DescTools)  # 4 AUC function
library(glue)
library(ggalluvial)  # 4 river plot
library(ggpubr)
source("~/multiOmic_benchmark/utils.R")
gg_color_hue <- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}
## Make output directory
outdir <- "~/multiOmic_benchmark/report/output/20191106_labelTransferEDA_F74/"
ifelse(!dir.exists(outdir), dir.create(outdir), FALSE)
[1] FALSE
integrate_features <- scan("~/models/intFeatures_union_hvg_2000_F74_SCElist_20191101.txt", what='')
Read 3027 items

Embeddings

Visualize label transfer on original ATAC data (embedded SnapATAC bins)

## Load original data
orig.ATAC <- readRDS("~/my_data/cellranger-atac110_count_30439_WSSS8038360_GRCh38-1_1_0.snapATAC.RDS")
sce.list <- readRDS("~/my_data/integrated_thymus/F74_SCElist_20191101.RDS")
orig.RNA <- sce.list$RNA
## Make SeuratObjects
atac.seu <- snapToSeurat(
    obj=orig.ATAC, 
    eigs.dims=1:20, 
    norm=TRUE,
    scale=TRUE
    )
Epoch: checking input parameters ... 
Non-unique features (rownames) present in the input matrix, making uniquePerforming log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Centering and scaling data matrix

  |                                                                                       
  |                                                                                 |   0%
  |                                                                                       
  |==                                                                               |   3%
  |                                                                                       
  |=====                                                                            |   6%
  |                                                                                       
  |=======                                                                          |   9%
  |                                                                                       
  |==========                                                                       |  12%
  |                                                                                       
  |============                                                                     |  15%
  |                                                                                       
  |==============                                                                   |  18%
  |                                                                                       
  |=================                                                                |  21%
  |                                                                                       
  |===================                                                              |  24%
  |                                                                                       
  |=====================                                                            |  26%
  |                                                                                       
  |========================                                                         |  29%
  |                                                                                       
  |==========================                                                       |  32%
  |                                                                                       
  |=============================                                                    |  35%
  |                                                                                       
  |===============================                                                  |  38%
  |                                                                                       
  |=================================                                                |  41%
  |                                                                                       
  |====================================                                             |  44%
  |                                                                                       
  |======================================                                           |  47%
  |                                                                                       
  |========================================                                         |  50%
  |                                                                                       
  |===========================================                                      |  53%
  |                                                                                       
  |=============================================                                    |  56%
  |                                                                                       
  |================================================                                 |  59%
  |                                                                                       
  |==================================================                               |  62%
  |                                                                                       
  |====================================================                             |  65%
  |                                                                                       
  |=======================================================                          |  68%
  |                                                                                       
  |=========================================================                        |  71%
  |                                                                                       
  |============================================================                     |  74%
  |                                                                                       
  |==============================================================                   |  76%
  |                                                                                       
  |================================================================                 |  79%
  |                                                                                       
  |===================================================================              |  82%
  |                                                                                       
  |=====================================================================            |  85%
  |                                                                                       
  |=======================================================================          |  88%
  |                                                                                       
  |==========================================================================       |  91%
  |                                                                                       
  |============================================================================     |  94%
  |                                                                                       
  |===============================================================================  |  97%
  |                                                                                       
  |=================================================================================| 100%
atac.seu <- RenameCells(atac.seu, new.names = orig.ATAC@metaData$barcode)
## Add cell type predictions
getPredictedLabels <- function(seu.int, int.name, id.col="predicted.id", score.col="score"){
  pred.df <- seu.int$ATAC@meta.data[,c(id.col, score.col), drop=F] 
  rownames(pred.df) <- str_remove(rownames(pred.df), "^ATAC_")
  colnames(pred.df) <- c(str_c("predicted.id", "_", int.name), str_c("score", "_", int.name))
  pred.df
  }
pred.cca <- getPredictedLabels(seu.cca, "CCA", score.col = "prediction.score.max")
pred.liger <- getPredictedLabels(seu.liger, "Liger")
pred.conos <- getPredictedLabels(seu.conos, "Conos")
if (all(rownames(pred.conos) == rownames(pred.cca)) & all(rownames(pred.conos) == rownames(pred.liger))) {
  atac.seu <- AddMetaData(atac.seu, metadata = cbind(pred.cca, pred.liger, pred.conos))
} else {
  stop("Non corresponding cell names")
}

pl <-     DimPlot(atac.seu, reduction = "umap.snap", group.by = "predicted.id_Liger", cols=cell.type.pal, label=TRUE, repel=TRUE) + ggtitle("Liger")
plotly::ggplotly(pl)
orig.RNA.seu <- as.Seurat(orig.RNA)
orig.RNA.seu <- FindVariableFeatures(orig.RNA.seu)
Calculating gene variances
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
orig.RNA.seu <- ScaleData(orig.RNA.seu)
Centering and scaling data matrix

  |                                                                                                                                              
  |                                                                                                                                        |   0%
  |                                                                                                                                              
  |====================================================================                                                                    |  50%
  |                                                                                                                                              
  |========================================================================================================================================| 100%
orig.RNA.seu <- RunPCA(orig.RNA.seu)
PC_ 1 
Positive:  TRBC2, TRBC1, HMGA1, HIST1H1C, HIST1H3H, ITM2A, HIST1H2BJ, SMIM24, TRAV13-2, FXYD2 
       TRBV7-2, PCGF5, HIST1H2BH, IL32, HIST1H2BN, CHAC1, RASD1, TRBV27, TRAV13-1, TRAV8-2 
       TRAV8-4, PTPN6, SELL, HIST1H2BG, TAGAP, TRDC, TRAV38-2DV8, TRAV29DV5, TRBV9, TRAV41 
Negative:  CALD1, COL5A2, COL6A1, COL6A2, SPARC, THY1, DCN, COL3A1, NFIB, SPARCL1 
       TSHZ2, CPE, PLAC9, NID1, FKBP10, PTN, FLRT2, MAP1B, EFEMP2, BGN 
       CXCL12, RBP1, LAMB1, AHNAK, COL1A1, COL5A1, FSTL1, LUM, LAMA4, MDK 
PC_ 2 
Positive:  SFRP1, NTRK2, PLAT, ISLR, NRK, SCARA5, ASPN, OSR1, OLFML3, MXRA8 
       CAPN6, PTPRD, PLP1, TMEFF2, CREB3L1, DKK3, CERCAM, MMP2, EBF2, SMOC2 
       CDO1, COL12A1, PDGFRA, LRRC17, THBS2, HTRA3, SFRP2, ANGPTL1, MAB21L1, MXRA5 
Negative:  MKI67, CDK1, NUSAP1, TOP2A, CCNA2, RRM2, UBE2C, BIRC5, KIFC1, TYMS 
       UBE2T, AURKB, CENPF, CENPM, CDCA8, TACC3, NCAPG, TPX2, ASF1B, CDKN3 
       GTSE1, CDCA3, HJURP, SPC25, MAD2L1, CDC20, PLK1, DLGAP5, NUF2, KIF22 
PC_ 3 
Positive:  CCNA2, NUF2, GTSE1, CDCA8, UBE2T, AURKB, PBK, CDK1, NCAPG, NDC80 
       KIFC1, CDCA3, HJURP, MAD2L1, SPC25, PLK1, KIF15, DEPDC1B, BIRC5, CDCA5 
       CKS1B, RRM2, DLGAP5, HMMR, CENPA, KIF22, KIF20A, CDCA2, CENPF, KIF2C 
Negative:  HLA-DRB5, HLA-DRA, TYROBP, HLA-DRB1, HLA-DPA1, HLA-DPB1, HLA-DQA1, C1QC, C1QB, RNASE1 
       HLA-DQB1, A2M, C1QA, CSF1R, STAB1, HCK, HLA-DMB, FOLR2, SAMHD1, LYZ 
       MS4A7, SPI1, CD36, MS4A6A, CYBB, TMEM176B, CD74, IGSF6, MPEG1, MS4A4A 
PC_ 4 
Positive:  GYPA, GYPB, NFE2, GYPE, ANK1, SLC4A1, RHAG, AHSP, DMTN, KLF1 
       GATA1, TMOD1, TMEM56, HBZ, HBG1, GMPR, C17orf99, SMIM5, HBQ1, TSPO2 
       ALAS2, PHOSPHO1, CR1L, TRIM58, HBM, EPB42, RHD, RHCE, SPTA1, SMIM1 
Negative:  TMSB10, TRBC2, CCL21, IL32, CXCL13, TRBC1, APLNR, CCL19, MIF, HMGB1 
       COX4I2, COL4A1, COL15A1, MADCAM1, FDCSP, CCL17, NTS, TNC, CDH5, FABP4 
       CAV1, COL4A2, CRIP2, RGS5, KLRB1, MYLK, IFITM1, IL33, PAPLN, HPN 
PC_ 5 
Positive:  APLNR, CDH5, CAV1, COL15A1, CRIP2, COL4A1, CCL21, KDR, CLDN5, MADCAM1 
       FABP4, IL33, COL4A2, CXCL13, TM4SF1, PODXL, CCL19, COX4I2, ESAM, BCAM 
       NTS, PAPLN, SPNS2, MYLK, C8orf4, ADGRF5, TM4SF18, TGM2, RP11-536O18.1, CXorf36 
Negative:  CSF1R, MS4A4A, CYBB, PLD4, MS4A6A, CD163, HCK, IGSF6, FOLR2, ADAP2 
       CD86, MS4A7, MARCH1, MRC1, F13A1, MPEG1, CD14, SPI1, HLA-DMB, GPR34 
       CD33, TGFBI, CLEC7A, TIMD4, CEBPA, SIGLEC1, CSF2RA, SLC15A3, LY86, AGR2 
orig.RNA.seu <- RunUMAP(orig.RNA.seu, dims=1:30)
13:04:19 UMAP embedding parameters a = 0.9922 b = 1.112
13:04:19 Read 8321 rows and found 30 numeric columns
13:04:19 Using Annoy for neighbor search, n_neighbors = 30
13:04:19 Building Annoy index with metric = cosine, n_trees = 50
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
13:04:21 Writing NN index file to temp file /tmp/Rtmpc3mqss/file9a4b15e3cd
13:04:21 Searching Annoy index using 1 thread, search_k = 3000
13:04:24 Annoy recall = 100%
13:04:27 Commencing smooth kNN distance calibration using 1 thread
13:04:30 Initializing from normalized Laplacian + noise
13:04:30 Commencing optimization for 500 epochs, with 360828 positive edges
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
13:04:51 Optimization finished
plotly::ggplotly(DimPlot(orig.RNA.seu, group.by="annotation"))

Prediction score

Quantifies the uncertainty of the prediction. Calculated differently for every method, but used to define which cells are “unassigned”.

ggpubr::ggarrange(predict_score_hist, predict_score_cumedist, common.legend = TRUE, widths = c(0.8, 1.2),
          labels=c("A", "B")) +
  ggsave(paste0(outdir, "prediction_score_distribution.png"), height = 6, width = 10)
Removed 22 rows containing non-finite values (stat_ecdf).

Cell type composition

Compare cell type fractions (w uncertainty)

pred.labels.df %>%
  group_by(method) %>%
  drop_na() %>%
  mutate(tot.cells=n()) %>%
  ungroup() %>%
  group_by(method, predicted.id) %>%
  summarise(tot.label = n(), tot.cells = max(tot.cells), mean.score=mean(score)) %>%
  mutate(frac.label=tot.label/tot.cells) %>%
  bind_rows(orig.frac.df) %>%
  mutate(orig.rank = orig.rank.df[predicted.id,]) %>%
  mutate(predicted.id=factor(predicted.id, levels=rownames(orig.rank.df)))%>%
  # select(method, predicted.id, frac.label) %>%
  # distinct() %>%
  ggplot(aes(predicted.id, frac.label, fill=mean.score, color=mean.score)) +
  geom_point() +
  geom_col(width=0.1) +
  coord_flip() +
  # geom_line(aes(group=method)) +
  facet_wrap(method~., nrow=1, ncol=4, scales="free_x") +
  scale_color_viridis_c() +
  scale_fill_viridis_c() +
  ylab("Fraction of cells") +
  theme_bw(base_size = 16) +
  ggsave(paste0(outdir, "cell_type_composition_bars.png"), width = 15, height = 7)
binding character and factor vector, coercing into character vector

Does the uncertainty depend on the size of the cluster?

pred.labels.df %>%
  group_by(method) %>%
  drop_na() %>%
  mutate(tot.cells=n()) %>%
  ungroup() %>%
  group_by(method, predicted.id) %>%
  summarise(tot.label = n(), tot.cells = max(tot.cells), mean.score=median(score), sd.score=mad(score)) %>%
  mutate(frac.label=tot.label/tot.cells) %>%
  # bind_rows(orig.frac.df) %>%
  ggplot(aes(frac.label, mean.score, color=method)) +
  geom_point(size=2) +
  geom_errorbar(aes(ymin=mean.score-sd.score, ymax=mean.score+sd.score), alpha=0.6) +
  scale_color_brewer(palette="Set1") +
  # geom_smooth(method = "loess", span=1.2) +
  facet_grid(. ~ method) +
  theme_bw(base_size = 16) +
  stat_cor(label.x = 0.2, label.y=0.25, color="black", size=5) 

NA
NA

Agreement with unsupervised clustering of ATAC data

Calculate which fractions of NNs in bin based graph of ATAC cells have the same annotation

k = 50
atac.seu <- FindNeighbors(atac.seu, assay = "ATAC", reduction = "SnapATAC", dims = 1:15, k.param = k)
Computing nearest neighbor graph
Computing SNN
atac.nn.list <- getNNlist(atac.seu)
score.CCA <- imap_dbl(atac.nn.list, ~ sum(pred.cca[.x,1] == pred.cca[.y,1])/k) %>% setNames(names(atac.nn.list))
score.Conos <- imap_dbl(atac.nn.list, ~ sum(pred.conos[.x,1] == pred.conos[.y,1])/k) %>% setNames(names(atac.nn.list))
score.Liger <- imap_dbl(atac.nn.list, ~ sum(pred.liger[.x,1] == pred.liger[.y,1])/k) %>% setNames(names(atac.nn.list))
knn_score_df <-
  as.data.frame(cbind(score.Conos, score.Liger, score.CCA)) %>%
  rownames_to_column("cell") %>%
  pivot_longer(cols=str_subset(colnames(.), "score"), names_to = "method", values_to = "KNN_score") %>%
  dplyr::mutate(KNN_score=ifelse(is.na(KNN_score), 0, KNN_score),
                method=str_remove(method, "score."))
quants = seq(0,1, by = 0.05)
AUECDF_knn_score <- knn_score_df %>%
  split(.$method) %>%
  map_dbl( ~ .x %>%
      arrange(KNN_score) %>% 
      {ecdf(.$KNN_score)(quants)} %>% AUC(quants,.)
    )
  
knn_score_df %>%
  mutate(AUC=AUECDF_knn_score[method]) %>%
  ggplot(aes(KNN_score, color=method, fill=method)) +
  stat_ecdf(size=1) +
  scale_color_brewer(palette = "Set1") +
  geom_text(data=. %>% group_by(method) %>% summarise(AUC=max(AUC)), 
            x=0.05, hjust=0,
            aes(label=glue("AUC = {round(AUC, 3)}"), y=c(0.90, 0.95, 1))) +
  theme_bw(base_size = 16) +
  ylab("ECDF") +
  ggsave(paste(outdir,"KNN_score_ecdf_unionHVG.png"), height = 4, width=6)

full_join(pred.labels.df, knn_score_df) %>%
  ggplot(aes(KNN_score, color=method)) +
  stat_ecdf() +
  facet_wrap("predicted.id") +
  scale_color_brewer(palette = "Set1") +
  coord_fixed()
Joining, by = c("cell", "method")

plot_KNNecdf <- function(cluster){
  full_join(pred.labels.df, knn_score_df) %>%
    filter(predicted.id==cluster) %>%
    ggplot(aes(KNN_score, color=method)) +
    stat_ecdf(size=0.8) +
    facet_wrap("predicted.id") +
    xlim(0,1) + ylim(0,1) +
    coord_fixed() +
    scale_color_brewer(palette = "Set1") +
    theme_bw(base_size = 16) +
    theme(legend.position = "top")
}

DimPlotCluster <- function(annotation_col, cluster, label){
  highlight = which(atac.seu@meta.data[,annotation_col]==cluster)
  DimPlot(atac.seu, reduction = "umap.snap",cells.highlight = highlight, cols.highlight = "red", pt.size = 0.02, sizes.highlight = 0.1) +
    guides(color="none") +
    ggtitle(label = label)
  }

UMAPs_cluster <- function(cluster){
  ggarrange(plotlist=imap(list(CCA="predicted.id_CCA", Conos="predicted.id_Conos", Liger="predicted.id_Liger"), ~ DimPlotCluster(.x, cluster, label = .y )), ncol=3, nrow=1) %>% annotate_figure(cluster)
}

map(cell.types, ~ ggarrange(plot_KNNecdf(.x), UMAPs_cluster(.x), nrow = 2, heights = c(1,0.8)))

Which cells are inconsistently aligned?

pred.labels.df %>%
  select(method, predicted.id, cell) %>%
  mutate(predicted.id=ifelse(is.na(predicted.id), "none", predicted.id)) %>%
  ggplot(aes(x=method, stratum=predicted.id, alluvium=cell, fill=predicted.id, label=predicted.id)) +
  geom_flow() +
  geom_stratum(color=NA) +
  geom_text(stat="stratum") +
  theme_bw(base_size = 16)

Which cells are inconsistently scored?

library(ggalluvial)
pred.labels.df %>%
  select(method, predicted.id, cell) %>%
  mutate(predicted.id=ifelse(is.na(predicted.id), "none", predicted.id)) %>%
  ggplot(aes(x=method, stratum=predicted.id, alluvium=cell, fill=predicted.id, label=predicted.id)) +
  geom_flow() +
  geom_stratum() +
  geom_text(stat="stratum") +
  theme_bw(base_size = 16)

Accessibility of markers

Taking markers from Fig. S2 of JP’s manuscript

thymus.markers <- c("PTPRC", "CD3G", "TYROBP","CD19","HOXA9",'FXYD2',"SH3TC1","CCR9","CD8A", "CD8B","PDCD1", "CRTAM","CD40LG","CCR6","FOXP3","SOX13","ZNF683","KLRD1","TNFSF11","VPREB1","MS4A1", "CLEC9A", "CLEC10A", "LAMP3", "IL3RA", "FCGR3B", "C2","TPSB2",
                    'ITGA2B',"GYPA", "CDH5", "RGS5","CDH1", "PDGFRA","CRABP1")
# pbmc.markers <- c("CD79A", "MS4A1", "CD8A", "CD8B", "LYZ")
# thymus.markers <- list(Fb=c("PDGFRA", "COLEC11", "FBN1", "PI16"),
#                        VSMC=c("PDGFRB", 'ACTA2', "RGS5"),
#                        Endo=c("PECAM1", "CDH5","LYVE1"),
#                        TEC = c("EPCAM", "FOXN1", "CCL25", "CCL19")
#                        )
thymus.markers.df <- imap(thymus.markers, ~ data.frame(gene=.x, cell.type.class=.y)) %>%
  purrr::reduce(bind_rows)
marker.access.df <- atac.seu@assays$ACTIVITY@data[intersect(thymus.markers, rownames(atac.seu@assays$ACTIVITY)),] %>%
  as.matrix() %>%
  reshape2::melt(varnames=c("gene", "cell"), value.name="log.counts") %>%
  full_join(rownames_to_column(atac.seu@meta.data[, label_cols], "cell")) %>%
  # full_join(thymus.markers.df) %>%
  pivot_longer(cols=label_cols, names_to = "method", values_to = "predicted.id") %>%
  dplyr::mutate(method=str_remove(method,".+_")) %>%
  filter(method %in% c("CCA", "Liger", "Conos")) 
ordered_cell_types <- c("DN(Q)", "DN(P)", "DP(Q)", "DP(P)", "αβT(entry)", "CD8+T", "CD4+T", "CD4+Tmem", 'Treg', "γδT", "NK", "ILC3", "DC1", "DC2", "aDC", "pDC", "Mac", "Ery", "Endo", "VSMC", "FB_1", "Fb_2")
markers_pl <- 
  marker.access.df %>%
  mutate(predicted.id = case_when(str_detect(predicted.id, "CD8") ~ "CD8+T",
                                  # str_detect(predicted.id, "CD4") ~ "CD4+T",
                                  TRUE ~ predicted.id
                                  )
         ) %>%
  mutate(predicted.id=factor(predicted.id, levels = ordered_cell_types)) %>%
  group_by(method, predicted.id, gene) %>%
  dplyr::mutate(frac.cells=sum(log.counts > 0)/n()) %>%
  # filter(method=="CCA") %>%
  ungroup() %>%
  ggplot( aes( gene, predicted.id)) +
  geom_point(aes(size=frac.cells, color=frac.cells)) +
  facet_grid(method~., space="free", scales="free_x") +
  scale_color_gradient(high="darkblue", low="white") +
  # scale_color_viridis_c() +
  theme_bw(base_size = 16) +
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5),
        strip.text.x = element_text(angle=45)) 
markers_pl 

  
ggsave(paste0(outdir, "Thymus_markers_accessibility.png"), height = 16, width = 12)

Reproducing Fig.2H on T-cell development

tcells.markers.df %>%
  full_join(t.cell.markers.df) %>%
  # filter(method=="CCA") %>%
  mutate(predicted.id=factor(predicted.id, levels=ordered.tcells)) %>%
  ggplot(aes( predicted.id, gene)) +
  facet_grid(cell.type.class~method, scales = "free_y", space="free") +
  geom_point(aes(size=frac.cells, color=frac.cells)) +
  scale_color_gradient(high="darkblue", low="white") +
  # scale_color_gradient2(midpoint = 0.5) +
  theme_bw(base_size = 16) +
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5),
        strip.text.y = element_text(angle=0)) 
Joining, by = "gene"
Column `gene` joining factor and character vector, coercing into character vector

Compare feature selection strategy (reference based)

integrate_features_ref <- scan("~/models/intFeatures_reference_hvg_2000_F74_SCElist_20191101.txt", what = "")
Read 1893 items

score.CCA.ref <-   imap_dbl(atac.nn.list, ~ sum(pred.cca.ref[.x,1] == pred.cca.ref[.y,1])/k) %>% setNames(names(atac.nn.list))
score.Conos.ref <- imap_dbl(atac.nn.list, ~ sum(pred.conos.ref[.x,1] == pred.conos.ref[.y,1])/k) %>% setNames(names(atac.nn.list))
score.Liger.ref <- imap_dbl(atac.nn.list, ~ sum(pred.liger.ref[.x,1] == pred.liger.ref[.y,1])/k) %>% setNames(names(atac.nn.list))
knn_score_df <-
  as.data.frame(cbind(score.Conos.ref, score.Liger.ref, score.CCA.ref)) %>%
  rownames_to_column("cell") %>%
  pivot_longer(cols=str_subset(colnames(.), "score"), names_to = "method", values_to = "KNN_score") %>%
  dplyr::mutate(KNN_score=ifelse(is.na(KNN_score), 0, KNN_score),
                method=str_remove(method, "score."))
quants = seq(0,1, by = 0.05)
AUECDF_knn_score <- knn_score_df %>%
  split(.$method) %>%
  map_dbl( ~ .x %>%
      arrange(KNN_score) %>% 
      {ecdf(.$KNN_score)(quants)} %>% AUC(quants,.)
    )
  
knn_score_df %>%
  mutate(AUC=AUECDF_knn_score[method]) %>%
  ggplot(aes(KNN_score, color=method, fill=method)) +
  stat_ecdf(size=1) +
  scale_color_brewer(palette = "Set1") +
  geom_text(data=. %>% group_by(method) %>% summarise(AUC=max(AUC)), 
            x=0.05, hjust=0,
            aes(label=glue("AUC = {round(AUC, 3)}"), y=c(0.90, 0.95, 1))) +
  theme_bw(base_size = 16) +
  ylab("ECDF") 

Is the union or the reference best maintaining the structure of the ATAC?


Thoughts

  • Conos scores a lot of cells with high confidence, but fails to assign cells to difficult clusters
  • CCA resembles the composition of the RNA data better, but curious that the other methods identify way more
---
title: "Label transfer EDA"
output: html_notebook
---


```{r}
library(Seurat)
library(SnapATAC)
library(tidyverse)
library(DescTools)  # 4 AUC function
library(glue)
library(ggalluvial)  # 4 river plot
library(ggpubr)
source("~/multiOmic_benchmark/utils.R")


gg_color_hue <- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

## Make output directory
outdir <- "~/multiOmic_benchmark/report/output/20191106_labelTransferEDA_F74/"
ifelse(!dir.exists(outdir), dir.create(outdir), FALSE)
```


```{r}
model.cca <- readRDS("~/models/modelCCA_union_hvg_F74_SCElist_20191101.RDS")
model.liger <- readRDS("~/models/modelLiger_union_hvg_F74_SCElist_20191101.RDS")
model.conos <- readRDS("~/models/modelConos_union_hvg_F74_SCElist_20191101.RDS")

seu.cca <- readRDS("~/models/labelTransferCCA_union_hvg_F74_SCElist_20191101.RDS")
seu.liger <- readRDS("~/models/labelTransferLiger_union_hvg_F74_SCElist_20191101.RDS")
seu.conos <- readRDS("~/models/labelTransferConos_union_hvg_F74_SCElist_20191101.RDS")


integrate_features <- scan("~/models/intFeatures_union_hvg_2000_F74_SCElist_20191101.txt", what='')

int.list <- list(CCA=seu.cca, Liger=seu.liger, Conos=seu.conos)

## Make method color palette
method.palette <- brewer_palette_4_values(names(int.list), "Set1")

```

### Embeddings
Visualize label transfer on original ATAC data (embedded SnapATAC bins)
```{r}
## Load original data
orig.ATAC <- readRDS("~/my_data/cellranger-atac110_count_30439_WSSS8038360_GRCh38-1_1_0.snapATAC.RDS")
sce.list <- readRDS("~/my_data/integrated_thymus/F74_SCElist_20191101.RDS")
orig.RNA <- sce.list$RNA

## Make SeuratObjects
atac.seu <- snapToSeurat(
    obj=orig.ATAC, 
    eigs.dims=1:20, 
    norm=TRUE,
    scale=TRUE
    )
atac.seu <- RenameCells(atac.seu, new.names = orig.ATAC@metaData$barcode)

## Add cell type predictions
getPredictedLabels <- function(seu.int, int.name, id.col="predicted.id", score.col="score"){
  pred.df <- seu.int$ATAC@meta.data[,c(id.col, score.col), drop=F] 
  rownames(pred.df) <- str_remove(rownames(pred.df), "^ATAC_")
  colnames(pred.df) <- c(str_c("predicted.id", "_", int.name), str_c("score", "_", int.name))
  pred.df
  }

pred.cca <- getPredictedLabels(seu.cca, "CCA", score.col = "prediction.score.max")
pred.liger <- getPredictedLabels(seu.liger, "Liger")
pred.conos <- getPredictedLabels(seu.conos, "Conos")

if (all(rownames(pred.conos) == rownames(pred.cca)) & all(rownames(pred.conos) == rownames(pred.liger))) {
  atac.seu <- AddMetaData(atac.seu, metadata = cbind(pred.cca, pred.liger, pred.conos))
} else {
  stop("Non corresponding cell names")
}
```

```{r, fig.height=8, fig.width=18}
## make cell type palette
cell.types <- levels(seu.cca$RNA$annotation)
cell.type.pal <- setNames(gg_color_hue(length(cell.types)), cell.types)

atac.seu <- RunUMAP(atac.seu, reduction = "SnapATAC", reduction.name = "umap.snap", dims=1:20)

ggpubr::ggarrange(
  plotlist = list(
    DimPlot(atac.seu, reduction = "umap.snap", group.by = "predicted.id_CCA"  , cols=cell.type.pal, label=TRUE, repel=TRUE) + ggtitle("CCA"),
    DimPlot(atac.seu, reduction = "umap.snap", group.by = "predicted.id_Liger", cols=cell.type.pal, label=TRUE, repel=TRUE) + ggtitle("Liger"),
    DimPlot(atac.seu, reduction = "umap.snap", group.by = "predicted.id_Conos", cols=cell.type.pal, label=TRUE, repel=TRUE) + ggtitle("Conos")
  ),
  common.legend = TRUE, ncol=3, nrow=1
) +
  ggsave(paste0(outdir, "umap_labels.png"), width=16, height = 8)


```

```{r}
pl <-     DimPlot(atac.seu, reduction = "umap.snap", group.by = "predicted.id_Liger", cols=cell.type.pal, label=TRUE, repel=TRUE) + ggtitle("Liger")
plotly::ggplotly(pl)
```


```{r}
orig.RNA.seu <- as.Seurat(orig.RNA)
orig.RNA.seu <- FindVariableFeatures(orig.RNA.seu)
orig.RNA.seu <- ScaleData(orig.RNA.seu)
orig.RNA.seu <- RunPCA(orig.RNA.seu)
orig.RNA.seu <- RunUMAP(orig.RNA.seu, dims=1:30)

plotly::ggplotly(DimPlot(orig.RNA.seu, group.by="annotation"))
```

## Prediction score
Quantifies the uncertainty of the prediction. Calculated differently for every method, but used to define which cells are "unassigned".


```{r}
orig.composition <- orig.RNA$annotation
orig.frac <- table(orig.composition)/length(orig.composition)

orig.frac.df <- data.frame(orig.frac) %>%
  dplyr::rename(predicted.id=orig.composition, frac.label=Freq) %>%
  mutate(method="original.RNA")

score_cols <- str_subset(colnames(atac.seu@meta.data), 'score_')
label_cols <- str_subset(colnames(atac.seu@meta.data), 'predicted.id_')

pred.labels.df <- imap(list(CCA=pred.cca, Liger=pred.liger, Conos=pred.conos), ~ 
      rownames_to_column(.x, "cell") %>%
      rename_all(funs(str_remove(., str_c("_",.y)))) %>%
      mutate(method=.y)
    ) %>%
  purrr::reduce(bind_rows) %>%
  mutate(score=ifelse(is.na(score), 0, score))

predict_score_hist <- 
  pred.labels.df %>%
  ggplot(aes(score, fill=method)) +
  geom_histogram(position="identity", alpha=0.8, bins=40) +
  facet_grid(method ~.) +
  scale_fill_brewer(palette="Set1") +
  xlab("Label prediction score") +
  theme_bw(base_size = 16) +
  theme(legend.position = "top")

cutoffs <- seq(0,1,0.05)
predict_score_cumedist <-
  pred.labels.df %>%
  group_by(method) %>%
  mutate(bins=cut(score, breaks = cutoffs)) %>%
  mutate(score=as.numeric(str_remove_all(as.character(bins), ".+,|]"))) %>%
  ggplot(aes(score, color=method)) +
  stat_ecdf(size=0.8, alpha=0.7) +
  scale_color_brewer(palette = "Set1") +
  ylab("Fraction of unassigned cells") +
  xlab("Prediction score cutoff") +
  theme_bw(base_size = 16) +
  xlim(0,1) +
  coord_fixed() +
  guides(color="none") 

ggpubr::ggarrange(predict_score_hist, predict_score_cumedist, common.legend = TRUE, widths = c(0.8, 1.2),
          labels=c("A", "B")) +
  ggsave(paste0(outdir, "prediction_score_distribution.png"), height = 6, width = 10)
```

```{r, fig.width=16, fig.height=8}
ggpubr::ggarrange(
  plotlist = list(
    FeaturePlot(atac.seu, reduction = "umap.snap", feature = "score_CCA"  , coord.fixed = TRUE) + ggtitle("CCA"),
    FeaturePlot(atac.seu, reduction = "umap.snap", feature = "score_Liger", coord.fixed = TRUE) + ggtitle("Liger"),
    FeaturePlot(atac.seu, reduction = "umap.snap", feature = "score_Conos", coord.fixed = TRUE) + ggtitle("Conos")
  ),
  common.legend = TRUE, ncol=3, nrow=1
) +
  ggsave(paste0(outdir, "prediction_score_umaps.png"), height = 7, width=14)
```


## Cell type composition

Compare cell type fractions (w uncertainty)
```{r, fig.height=7, fig.width=6}

pred.labels.df %>%
  group_by(method) %>%
  drop_na() %>%
  mutate(tot.cells=n()) %>%
  ungroup() %>%
  group_by(method, predicted.id) %>%
  summarise(tot.label = n(), tot.cells = max(tot.cells), mean.score=mean(score)) %>%
  mutate(frac.label=tot.label/tot.cells) %>%
  # bind_rows(orig.frac.df) %>%
  ggplot(aes(method, predicted.id)) +
  geom_point(aes(color=mean.score, size=frac.label)) +
  scale_color_viridis_c(name="Mean prediction\nscore") +
  scale_shape(name="Fraction of cells") +
  # scale_color_gradient(low ="grey", high="red") +
  theme_classic(base_size = 16) +
  ggsave(paste0(outdir, "cell_type_composition.png"), width=6, height = 6)
  

```
```{r, fig.width=14, fig.height=7}
orig.rank.df <- orig.frac.df %>% 
  mutate(orig.rank=dense_rank(frac.label)) %>%
  select(orig.rank, predicted.id) %>%
  distinct() %>%
  arrange(orig.rank) %>%
  column_to_rownames("predicted.id") 

pred.labels.df %>%
  group_by(method) %>%
  drop_na() %>%
  mutate(tot.cells=n()) %>%
  ungroup() %>%
  group_by(method, predicted.id) %>%
  summarise(tot.label = n(), tot.cells = max(tot.cells), mean.score=mean(score)) %>%
  mutate(frac.label=tot.label/tot.cells) %>%
  bind_rows(orig.frac.df) %>%
  mutate(orig.rank = orig.rank.df[predicted.id,]) %>%
  mutate(predicted.id=factor(predicted.id, levels=rownames(orig.rank.df)))%>%
  # select(method, predicted.id, frac.label) %>%
  # distinct() %>%
  ggplot(aes(predicted.id, frac.label, fill=mean.score, color=mean.score)) +
  geom_point() +
  geom_col(width=0.1) +
  coord_flip() +
  # geom_line(aes(group=method)) +
  facet_wrap(method~., nrow=1, ncol=4, scales="free_x") +
  scale_color_viridis_c() +
  scale_fill_viridis_c() +
  ylab("Fraction of cells") +
  theme_bw(base_size = 16) +
  ggsave(paste0(outdir, "cell_type_composition_bars.png"), width = 15, height = 7)
```

Does the uncertainty depend on the size of the cluster?
```{r, fig.width=14, fig.height=5}

pred.labels.df %>%
  group_by(method) %>%
  drop_na() %>%
  mutate(tot.cells=n()) %>%
  ungroup() %>%
  group_by(method, predicted.id) %>%
  summarise(tot.label = n(), tot.cells = max(tot.cells), mean.score=median(score), sd.score=mad(score)) %>%
  mutate(frac.label=tot.label/tot.cells) %>%
  # bind_rows(orig.frac.df) %>%
  ggplot(aes(frac.label, mean.score, color=method)) +
  geom_point(size=2) +
  geom_errorbar(aes(ymin=mean.score-sd.score, ymax=mean.score+sd.score), alpha=0.6) +
  scale_color_brewer(palette="Set1") +
  # geom_smooth(method = "loess", span=1.2) +
  facet_grid(. ~ method) +
  theme_bw(base_size = 16) +
  stat_cor(label.x = 0.2, label.y=0.25, color="black", size=5) 
  
  
```

### Agreement with unsupervised clustering of ATAC data
Calculate which fractions of NNs in bin based graph of ATAC cells have the same annotation
```{r}
k = 50
atac.seu <- FindNeighbors(atac.seu, assay = "ATAC", reduction = "SnapATAC", dims = 1:15, k.param = k)

atac.nn.list <- getNNlist(atac.seu)

score.CCA <- imap_dbl(atac.nn.list, ~ sum(pred.cca[.x,1] == pred.cca[.y,1])/k) %>% setNames(names(atac.nn.list))
score.Conos <- imap_dbl(atac.nn.list, ~ sum(pred.conos[.x,1] == pred.conos[.y,1])/k) %>% setNames(names(atac.nn.list))
score.Liger <- imap_dbl(atac.nn.list, ~ sum(pred.liger[.x,1] == pred.liger[.y,1])/k) %>% setNames(names(atac.nn.list))

knn_score_df <-
  as.data.frame(cbind(score.Conos, score.Liger, score.CCA)) %>%
  rownames_to_column("cell") %>%
  pivot_longer(cols=str_subset(colnames(.), "score"), names_to = "method", values_to = "KNN_score") %>%
  dplyr::mutate(KNN_score=ifelse(is.na(KNN_score), 0, KNN_score),
                method=str_remove(method, "score."))

quants = seq(0,1, by = 0.05)
AUECDF_knn_score <- knn_score_df %>%
  split(.$method) %>%
  map_dbl( ~ .x %>%
      arrange(KNN_score) %>% 
      {ecdf(.$KNN_score)(quants)} %>% AUC(quants,.)
    )
  
knn_score_df %>%
  mutate(AUC=AUECDF_knn_score[method]) %>%
  ggplot(aes(KNN_score, color=method, fill=method)) +
  stat_ecdf(size=1) +
  scale_color_brewer(palette = "Set1") +
  geom_text(data=. %>% group_by(method) %>% summarise(AUC=max(AUC)), 
            x=0.05, hjust=0,
            aes(label=glue("AUC = {round(AUC, 3)}"), y=c(0.90, 0.95, 1))) +
  theme_bw(base_size = 16) +
  ylab("ECDF") +
  ggsave(paste(outdir,"KNN_score_ecdf_unionHVG.png"), height = 4, width=6)
```

```{r, fig.height=8, fig.width=8}

full_join(pred.labels.df, knn_score_df) %>%
  ggplot(aes(KNN_score, color=method)) +
  stat_ecdf() +
  facet_wrap("predicted.id") +
  scale_color_brewer(palette = "Set1") +
  coord_fixed()
```
```{r, fig.height=8, fig.width=7, message=FALSE}
plot_KNNecdf <- function(cluster){
  full_join(pred.labels.df, knn_score_df) %>%
    filter(predicted.id==cluster) %>%
    ggplot(aes(KNN_score, color=method)) +
    stat_ecdf(size=0.8) +
    facet_wrap("predicted.id") +
    xlim(0,1) + ylim(0,1) +
    coord_fixed() +
    scale_color_brewer(palette = "Set1") +
    theme_bw(base_size = 16) +
    theme(legend.position = "top")
}

DimPlotCluster <- function(annotation_col, cluster, label){
  highlight = which(atac.seu@meta.data[,annotation_col]==cluster)
  DimPlot(atac.seu, reduction = "umap.snap",cells.highlight = highlight, cols.highlight = "red", pt.size = 0.02, sizes.highlight = 0.1) +
    guides(color="none") +
    ggtitle(label = label)
  }

UMAPs_cluster <- function(cluster){
  ggarrange(plotlist=imap(list(CCA="predicted.id_CCA", Conos="predicted.id_Conos", Liger="predicted.id_Liger"), ~ DimPlotCluster(.x, cluster, label = .y )), ncol=3, nrow=1) %>% annotate_figure(cluster)
}

map(cell.types, ~ ggarrange(plot_KNNecdf(.x), UMAPs_cluster(.x), nrow = 2, heights = c(1,0.8)))

```


#### Which cells are inconsistently aligned?
```{r, fig.width=14, fig.height=10}
pred.labels.df %>%
  select(method, predicted.id, cell) %>%
  mutate(predicted.id=ifelse(is.na(predicted.id), "none", predicted.id)) %>%
  ggplot(aes(x=method, stratum=predicted.id, alluvium=cell, fill=predicted.id, label=predicted.id)) +
  geom_flow() +
  geom_stratum(color=NA) +
  geom_text(stat="stratum") +
  theme_bw(base_size = 16)
```




#### Which cells are inconsistently scored?
```{r, fig.width=14, fig.height=8}
library(ggalluvial)
pred.labels.df %>%
  select(method, predicted.id, cell) %>%
  mutate(predicted.id=ifelse(is.na(predicted.id), "none", predicted.id)) %>%
  ggplot(aes(x=method, stratum=predicted.id, alluvium=cell, fill=predicted.id, label=predicted.id)) +
  geom_flow() +
  geom_stratum() +
  geom_text(stat="stratum") +
  theme_bw(base_size = 16)
```

## Accessibility of markers
Taking markers from Fig. S2 of JP's manuscript
```{r, fig.height=13, fig.width=10, warning=FALSE, message=FALSE}
thymus.markers <- c("PTPRC", "CD3G", "TYROBP","CD19","HOXA9",'FXYD2',"SH3TC1","CCR9","CD8A", "CD8B","PDCD1", "CRTAM","CD40LG","CCR6","FOXP3","SOX13","ZNF683","KLRD1","TNFSF11","VPREB1","MS4A1", "CLEC9A", "CLEC10A", "LAMP3", "IL3RA", "FCGR3B", "C2","TPSB2",
                    'ITGA2B',"GYPA", "CDH5", "RGS5","CDH1", "PDGFRA","CRABP1")
# pbmc.markers <- c("CD79A", "MS4A1", "CD8A", "CD8B", "LYZ")
# thymus.markers <- list(Fb=c("PDGFRA", "COLEC11", "FBN1", "PI16"),
#                        VSMC=c("PDGFRB", 'ACTA2', "RGS5"),
#                        Endo=c("PECAM1", "CDH5","LYVE1"),
#                        TEC = c("EPCAM", "FOXN1", "CCL25", "CCL19")
#                        )
thymus.markers.df <- imap(thymus.markers, ~ data.frame(gene=.x, cell.type.class=.y)) %>%
  purrr::reduce(bind_rows)

marker.access.df <- atac.seu@assays$ACTIVITY@data[intersect(thymus.markers, rownames(atac.seu@assays$ACTIVITY)),] %>%
  as.matrix() %>%
  reshape2::melt(varnames=c("gene", "cell"), value.name="log.counts") %>%
  full_join(rownames_to_column(atac.seu@meta.data[, label_cols], "cell")) %>%
  # full_join(thymus.markers.df) %>%
  pivot_longer(cols=label_cols, names_to = "method", values_to = "predicted.id") %>%
  dplyr::mutate(method=str_remove(method,".+_")) %>%
  filter(method %in% c("CCA", "Liger", "Conos")) 

ordered_cell_types <- c("DN(Q)", "DN(P)", "DP(Q)", "DP(P)", "αβT(entry)", "CD8+T", "CD4+T", "CD4+Tmem", 'Treg', "γδT", "NK", "ILC3", "DC1", "DC2", "aDC", "pDC", "Mac", "Ery", "Endo", "VSMC", "FB_1", "Fb_2")

markers_pl <- 
  marker.access.df %>%
  mutate(predicted.id = case_when(str_detect(predicted.id, "CD8") ~ "CD8+T",
                                  # str_detect(predicted.id, "CD4") ~ "CD4+T",
                                  TRUE ~ predicted.id
                                  )
         ) %>%
  mutate(predicted.id=factor(predicted.id, levels = ordered_cell_types)) %>%
  group_by(method, predicted.id, gene) %>%
  dplyr::mutate(frac.cells=sum(log.counts > 0)/n()) %>%
  # filter(method=="CCA") %>%
  ungroup() %>%
  ggplot( aes( gene, predicted.id)) +
  geom_point(aes(size=frac.cells, color=frac.cells)) +
  facet_grid(method~., space="free", scales="free_x") +
  scale_color_gradient(high="darkblue", low="white") +
  # scale_color_viridis_c() +
  theme_bw(base_size = 16) +
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5),
        strip.text.x = element_text(angle=45)) 

markers_pl 
  
ggsave(paste0(outdir, "Thymus_markers_accessibility.png"), height = 16, width = 12)
```

Reproducing Fig.2H on T-cell development
```{r, fig.width=10, fig.height=12}
t.cell.markers <- list(known.markers = c("CD34", "IGLL1", "TRGC2", "TRDC", "PTCRA", "TRBC2", "TRAC", "CD4", "CD8A", "CD8B"),
                       chemokine.receptors = c("CCR9", "CCR7"),
                       tcr.activation = c("CD5", "CD27"),
                       proliferation=c("PCNA", "CDK1", "MKI67"),
                       cyclin.D = c("CCND2", "CCND3"),
                       recombination=c("RAG1", "RAG2"),
                       apoptosis=c("HRK","BMF", "TP53INP1"),
                       stage.markers = c("ST18", "HIVEP3", "RGPD3", "SMPD3", "AQP3", "RORC", "SATB1", "TOX2")
                       ) 
t.cell.markers.df <- imap(t.cell.markers, ~ data.frame(gene=.x, cell.type.class=.y)) %>%
  purrr::reduce(bind_rows)

ordered.tcells <- c("DN(P)", "DN(Q)", "DP(P)", "DP(Q)","αβT(entry)", "CD8+T", "CD4+T")

tcells.markers.df <- 
  atac.seu@assays$ACTIVITY@data[intersect(unlist(t.cell.markers), rownames(atac.seu@assays$ACTIVITY)),] %>%
  as.matrix() %>%
  reshape2::melt(varnames=c("gene", "cell"), value.name="log.counts") %>%
  full_join(rownames_to_column(atac.seu@meta.data[, label_cols], "cell")) %>%
  pivot_longer(cols=label_cols, names_to = "method", values_to = "predicted.id") %>%
  dplyr::mutate(method=str_remove(method,".+_")) %>%
  filter(method %in% c("CCA", "Liger", "Conos")) %>%
  mutate(predicted.id=ifelse(str_detect(predicted.id, "CD8+"), "CD8+T", predicted.id)) %>%
  mutate(predicted.id=ifelse(str_detect(predicted.id, "CD4+"), "CD4+T", predicted.id)) %>%
  filter(predicted.id %in% ordered.tcells) %>%
  group_by(method, predicted.id, gene) %>%
  dplyr::mutate(frac.cells=sum(log.counts > 0)/n(), mean.acc=mean(log.counts)) %>%
  ungroup() 

tcells.markers.df %>%
  full_join(t.cell.markers.df) %>%
  # filter(method=="CCA") %>%
  mutate(predicted.id=factor(predicted.id, levels=ordered.tcells)) %>%
  ggplot(aes( predicted.id, gene)) +
  facet_grid(cell.type.class~method, scales = "free_y", space="free") +
  geom_point(aes(size=frac.cells, color=frac.cells)) +
  scale_color_gradient(high="darkblue", low="white") +
  # scale_color_gradient2(midpoint = 0.5) +
  theme_bw(base_size = 16) +
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5),
        strip.text.y = element_text(angle=0)) 

ggsave(paste0(outdir, "tcell_markers.png"), height = 14, width = 14)

```

### Compare feature selection strategy (reference based)
```{r}
seu.cca.ref <- readRDS("~/models/labelTransferCCA_reference_hvg_F74_SCElist_20191101.RDS")
seu.liger.ref <- readRDS("~/models/labelTransferLiger_reference_hvg_F74_SCElist_20191101.RDS")
seu.conos.ref <- readRDS("~/models/labelTransferConos_reference_hvg_F74_SCElist_20191101.RDS")

integrate_features_ref <- scan("~/models/intFeatures_reference_hvg_2000_F74_SCElist_20191101.txt", what = "")

int.list.ref <- list(CCA=seu.cca.ref, Liger=seu.liger.ref, Conos=seu.conos.ref)

## Add to atac Seurat object
pred.cca.ref <- getPredictedLabels(seu.cca.ref, "CCA_ref", score.col = "prediction.score.max")
pred.liger.ref <- getPredictedLabels(seu.liger.ref, "Liger_ref")
pred.conos.ref <- getPredictedLabels(seu.conos.ref, "Conos_ref")

if (all(rownames(pred.conos) == rownames(pred.cca)) & all(rownames(pred.conos) == rownames(pred.liger))) {
  atac.seu <- AddMetaData(atac.seu, metadata = cbind(pred.cca.ref, pred.liger.ref, pred.conos.ref))
} else {
  stop("Non corresponding cell names")
}

```

```{r, fig.width=19, fig.height=9}
ggpubr::ggarrange(
  plotlist = list(
    DimPlot(atac.seu, reduction = "umap.snap", group.by = "predicted.id_CCA_ref"  , cols=cell.type.pal, label=TRUE, repel=TRUE) + ggtitle("CCA"),
    DimPlot(atac.seu, reduction = "umap.snap", group.by = "predicted.id_Liger_ref", cols=cell.type.pal, label=TRUE, repel=TRUE) + ggtitle("Liger"),
    DimPlot(atac.seu, reduction = "umap.snap", group.by = "predicted.id_Conos_ref", cols=cell.type.pal, label=TRUE, repel=TRUE) + ggtitle("Conos")
  ),
  common.legend = TRUE, ncol=3, nrow=1
) 
```

```{r, fig.height=10, fig.width=16}
pred.labels.ref.df <- imap(list(CCA=pred.cca.ref, Liger=pred.liger.ref, Conos=pred.conos.ref), ~ 
      rownames_to_column(.x, "cell") %>%
      rename_all(funs(str_remove(., str_c("_",.y)))) %>%
      mutate(method=.y)
    ) %>%
  purrr::reduce(bind_rows) %>%
  mutate(score=ifelse(is.na(score_ref), 0, score_ref))

full_join(
  pred.labels.df,
  select(pred.labels.ref.df, cell, predicted.id_ref, score_ref, method),
  by=c("cell", "method")
  ) %>%
  group_by(method, predicted.id) %>%
  mutate(n_pred=n()) %>%
  ungroup() %>%
  group_by(method, predicted.id, predicted.id_ref) %>%
  summarise(n=n(), n_pred=max(n_pred)) %>%
  mutate(frac=n/n_pred) %>%
  ggplot(aes(predicted.id, predicted.id_ref)) +
  geom_tile(aes(fill=frac)) +
  facet_wrap(method~., nrow=1, ncol=3) +
  coord_fixed() +
  scale_fill_gradient(low="white", high="red") +
  ylab("Feat. selection: reference HVG") + xlab("Feat. selection: union HVG") +
  theme_cowplot(font_size = 16) +
  theme(axis.text.x = element_text(angle=45, hjust=1)) +
  ggsave(paste0(outdir, "unionVSreference.png"), height = 12, width=10)
```
```{r}

score.CCA.ref <-   imap_dbl(atac.nn.list, ~ sum(pred.cca.ref[.x,1] == pred.cca.ref[.y,1])/k) %>% setNames(names(atac.nn.list))
score.Conos.ref <- imap_dbl(atac.nn.list, ~ sum(pred.conos.ref[.x,1] == pred.conos.ref[.y,1])/k) %>% setNames(names(atac.nn.list))
score.Liger.ref <- imap_dbl(atac.nn.list, ~ sum(pred.liger.ref[.x,1] == pred.liger.ref[.y,1])/k) %>% setNames(names(atac.nn.list))

knn_score_ref_df <-
  as.data.frame(cbind(score.Conos.ref, score.Liger.ref, score.CCA.ref)) %>%
  rownames_to_column("cell") %>%
  pivot_longer(cols=str_subset(colnames(.), "score"), names_to = "method", values_to = "KNN_score") %>%
  dplyr::mutate(KNN_score=ifelse(is.na(KNN_score), 0, KNN_score),
                method=str_remove(method, "score."))

quants = seq(0,1, by = 0.05)
AUECDF_knn_score <- knn_score_ref_df %>%
  split(.$method) %>%
  map_dbl( ~ .x %>%
      arrange(KNN_score) %>% 
      {ecdf(.$KNN_score)(quants)} %>% AUC(quants,.)
    )
  
knn_score_ref_df %>%
  mutate(AUC=AUECDF_knn_score[method]) %>%
  ggplot(aes(KNN_score, color=method, fill=method)) +
  stat_ecdf(size=1) +
  scale_color_brewer(palette = "Set1") +
  geom_text(data=. %>% group_by(method) %>% summarise(AUC=max(AUC)), 
            x=0.05, hjust=0,
            aes(label=glue("AUC = {round(AUC, 3)}"), y=c(0.90, 0.95, 1))) +
  theme_bw(base_size = 16) +
  ylab("ECDF") 
```

### Is the union or the reference best maintaining the structure of the ATAC?
```{r, fig.width=15,fig.height=7}
k = 50
atac.seu <- FindNeighbors(atac.seu, assay = "ATAC", reduction = "SnapATAC", dims = 1:15, k.param = k)

atac.nn.list <- getNNlist(atac.seu)

score.CCA <- imap_dbl(atac.nn.list, ~ sum(pred.cca[.x,1] == pred.cca[.y,1])/k) %>% setNames(names(atac.nn.list))
score.Conos <- imap_dbl(atac.nn.list, ~ sum(pred.conos[.x,1] == pred.conos[.y,1])/k) %>% setNames(names(atac.nn.list))
score.Liger <- imap_dbl(atac.nn.list, ~ sum(pred.liger[.x,1] == pred.liger[.y,1])/k) %>% setNames(names(atac.nn.list))

knn_score_df <-
  as.data.frame(cbind(score.Conos, score.Liger, score.CCA)) %>%
  rownames_to_column("cell") %>%
  pivot_longer(cols=str_subset(colnames(.), "score"), names_to = "method", values_to = "KNN_score") %>%
  dplyr::mutate(KNN_score=ifelse(is.na(KNN_score), 0, KNN_score),
                method=str_remove(method, "score."))


score.CCA.ref <-   imap_dbl(atac.nn.list, ~ sum(pred.cca.ref[.x,1] == pred.cca.ref[.y,1])/k) %>% setNames(names(atac.nn.list))
score.Conos.ref <- imap_dbl(atac.nn.list, ~ sum(pred.conos.ref[.x,1] == pred.conos.ref[.y,1])/k) %>% setNames(names(atac.nn.list))
score.Liger.ref <- imap_dbl(atac.nn.list, ~ sum(pred.liger.ref[.x,1] == pred.liger.ref[.y,1])/k) %>% setNames(names(atac.nn.list))

knn_score_ref_df <-
  as.data.frame(cbind(score.Conos.ref, score.Liger.ref, score.CCA.ref)) %>%
  rownames_to_column("cell") %>%
  pivot_longer(cols=str_subset(colnames(.), "score"), names_to = "method", values_to = "KNN_score") %>%
  dplyr::mutate(KNN_score=ifelse(is.na(KNN_score), 0, KNN_score),
                method=str_remove(method, "score."))


bind_rows(knn_score_df, knn_score_ref_df) %>%
  mutate(feature.selection=ifelse(str_detect(method, "ref"), "ref", "union")) %>%
  mutate(method=str_remove(method, ".ref")) %>%
  ggplot(aes(KNN_score, color=feature.selection, fill=method)) +
  stat_ecdf(size=1) +
  scale_color_brewer(palette = "Set1") +
  facet_wrap(method~.) +
  theme_bw(base_size = 16) +
  ylab("ECDF") +
  ggtitle(paste("K =", k)) +
  ggsave(paste0(outdir, "unionVSreference_KNN.png"), height = 4, width = 10)

```

---

```{r}
plotly::ggplotly(DimPlot(orig.RNA.seu, reduction = "umap.snap", group.by = "predicted.id_CCA"))
plotly::ggplotly(DimPlot(atac.seu, reduction = "umap.snap", group.by = "predicted.id_CCA"))
```


### Thoughts
- Conos scores a lot of cells with high confidence, but fails to assign cells to difficult clusters 
- CCA resembles the composition of the RNA data better, but curious that the other methods identify way more 










